In this workshop, we will learn how to visualize data using R. we will use the output generated in previous sessions, including abundance data from quantification and results statistical analysis.
We will look over several papers, e.g,
Analysis of host attachment and growth rates of CPR/DPANN organisms.)
.Heat map representing differentially abundant bacterial species (false discovery rate < 0.01) between HSPC and CRPC.
## A list of packages that we will use for this session
pkgs = c("tidyverse","ggplot2","phyloseq","ggpubr","stringr","dutchmasters","ggsci","devtools")
## Install the package if it has't been installed yet
for (i in pkgs){
if(! i %in% installed.packages()){
BiocManager::install(i, ask = F, update = F)
}
}
## Load all the packages
invisible(lapply(pkgs, function(x) library(x, character.only=TRUE)))
## Alterative way for smplot
devtools::install_github('smin95/smplot')
library(smplot)
## remove the intermediate variables
rm(pkgs, i)
## phyloseq object that you generated
load('phyloseq_object_relative_abundance_2022.Rdata')
## Results from statistical analysis
load('Alpha_diversity_by_read.Rdata')
phylo_obj
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 370 taxa and 46 samples ]
## sample_data() Sample Data: [ 46 samples by 5 sample variables ]
## tax_table() Taxonomy Table: [ 370 taxa by 7 taxonomic ranks ]
phylo_obj@sam_data[1:3,1:5]
## SampleID Condition Age Gender BMI
## ERR011273 ERR011273 Disease 48 Male 25
## ERR011275 ERR011275 Disease 62 Male 25
## ERR011277 ERR011277 Disease 49 Female 30
phylo_obj@otu_table[1:3,1:3]
## OTU Table: [3 taxa and 3 samples]
## taxa are rows
## ERR011273 ERR011275 ERR011277
## OTU 1 46 0 5
## OTU 2 10 0 1
## OTU 3 10 15 7
phylo_obj@tax_table[1:3,1:3]
## Taxonomy Table: [3 taxa by 3 taxonomic ranks]:
## Kingdom Phylum Class
## OTU 1 "k__Bacteria" "p__Bacteroidetes" "c__Bacteroidia"
## OTU 2 "k__Bacteria" "p__Bacteroidetes" "c__Bacteroidia"
## OTU 3 "k__Bacteria" "p__Firmicutes" "c__Clostridia"
pheatmap::pheatmap(phylo_obj@otu_table)
#e.g., only OTUs with a mean greater than 10^-5 are kept.
phylo_obj_filter = filter_taxa(phylo_obj, function(x) mean(x) > 1e-5, TRUE)
#number of OTUs in orignial phyloseq object.
dim(phylo_obj@otu_table)
## [1] 370 46
#number of OTUs after filtering
dim(phylo_obj_filter@otu_table)
## [1] 147 46
This results in a highly-subsetted object, phylo_obj_filter, containing just 147 of the original ~370 OTUs.
total = median(sample_sums(phylo_obj_filter))
abundant_OTU = filter_taxa(phylo_obj_filter, function(x) sum(x > total*0.20) > 0, TRUE)
abundant_OTU
## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 13 taxa and 46 samples ]
## sample_data() Sample Data: [ 46 samples by 5 sample variables ]
## tax_table() Taxonomy Table: [ 13 taxa by 7 taxonomic ranks ]
pd = psmelt(phylo_obj_filter)
pd_abundant = psmelt(abundant_OTU)
head(pd)
## OTU Sample Abundance SampleID Condition Age Gender BMI Kingdom
## 6693 OTU 95 ERR011287 66 ERR011287 Disease 38 Female 23 k__Bacteria
## 34 OTU 1 ERR011285 60 ERR011285 Disease 45 Male 27 k__Bacteria
## 4886 OTU 43 ERR011275 58 ERR011275 Disease 62 Male 25 k__Bacteria
## 26 OTU 1 ERR011299 50 ERR011299 Disease 37 Male 31 k__Bacteria
## 1 OTU 1 ERR011273 46 ERR011273 Disease 48 Male 25 k__Bacteria
## 24 OTU 1 ERR011281 44 ERR011281 Disease 62 Female 35 k__Bacteria
## Phylum Class Order Family
## 6693 p__Bacteroidetes c__Bacteroidia o__Bacteroidales f__Bacteroidaceae
## 34 p__Bacteroidetes c__Bacteroidia o__Bacteroidales f__Prevotellaceae
## 4886 p__Firmicutes c__Clostridia o__Clostridiales f__Lachnospiraceae
## 26 p__Bacteroidetes c__Bacteroidia o__Bacteroidales f__Prevotellaceae
## 1 p__Bacteroidetes c__Bacteroidia o__Bacteroidales f__Prevotellaceae
## 24 p__Bacteroidetes c__Bacteroidia o__Bacteroidales f__Prevotellaceae
## Genus Species
## 6693 g__Bacteroides s__Bacteroides_eggerthii
## 34 g__Prevotella s__Prevotella_copri
## 4886 g__Roseburia s__Roseburia_faecis
## 26 g__Prevotella s__Prevotella_copri
## 1 g__Prevotella s__Prevotella_copri
## 24 g__Prevotella s__Prevotella_copri
6762 obs. of 15 variables
pd = pd %>% mutate(Kingdom = str_replace(Kingdom, 'k__',''),
Phylum = str_replace(Phylum, 'p__',''),
Class = str_replace(Class, 'c__',''),
Order = str_replace(Order, 'o__',''),
Family = str_replace(Family, 'f__',''),
Genus = str_replace(Genus, 'g__',''),
Species = str_replace(Species, 's__',''))
pd[1:3,9:12]
## Kingdom Phylum Class Order
## 6693 Bacteria Bacteroidetes Bacteroidia Bacteroidales
## 34 Bacteria Bacteroidetes Bacteroidia Bacteroidales
## 4886 Bacteria Firmicutes Clostridia Clostridiales
head(Adiv)
## Observed Chao1 se.chao1 ACE se.ACE Shannon Simpson InvSimpson
## ERR011273 88 88 0 NaN NaN 2.207899 0.7169744 3.533250
## ERR011275 78 78 0 NaN NaN 1.680507 0.5969508 2.481087
## ERR011277 88 88 0 NaN NaN 3.439262 0.9434077 17.670261
## ERR011279 107 107 0 NaN NaN 3.313319 0.9423581 17.348499
## ERR011281 55 55 0 NaN NaN 2.377746 0.7761130 4.466538
## ERR011283 83 83 0 NaN NaN 2.342265 0.7793263 4.531578
Adiv_meta = Adiv %>% mutate(SampleID = rownames(.)) %>%
right_join(SampleData, by = 'SampleID') %>%
select(SampleID, everything())
head(Adiv_meta)
## SampleID Observed Chao1 se.chao1 ACE se.ACE Shannon Simpson InvSimpson
## 1 ERR011273 88 88 0 NaN NaN 2.207899 0.7169744 3.533250
## 2 ERR011275 78 78 0 NaN NaN 1.680507 0.5969508 2.481087
## 3 ERR011277 88 88 0 NaN NaN 3.439262 0.9434077 17.670261
## 4 ERR011279 107 107 0 NaN NaN 3.313319 0.9423581 17.348499
## 5 ERR011281 55 55 0 NaN NaN 2.377746 0.7761130 4.466538
## 6 ERR011283 83 83 0 NaN NaN 2.342265 0.7793263 4.531578
## Condition Age Gender BMI
## 1 Disease 48 Male 25
## 2 Disease 62 Male 25
## 3 Disease 49 Female 30
## 4 Disease 63 Female 28
## 5 Disease 62 Female 35
## 6 Disease 41 Female 28
colnames(Adiv_meta)
## [1] "SampleID" "Observed" "Chao1" "se.chao1" "ACE"
## [6] "se.ACE" "Shannon" "Simpson" "InvSimpson" "Condition"
## [11] "Age" "Gender" "BMI"
## Create a basic layer by setting up the x- and y-axis of graph
p1 = ggplot(Adiv_meta, aes(x = SampleID, y = BMI))
## Add the second layer, like barplot
p2 = ggplot(Adiv_meta, aes(x = SampleID, y = BMI)) + geom_bar(stat = 'identity')
## Add the second layer, like barplot (side-by-side)
p3 = ggplot(Adiv_meta, aes(x = SampleID, y = BMI, fill = Gender)) +
geom_bar(stat = 'identity', position = position_dodge())
## Add the second layer, like barplot (stacked)
p4 = ggplot(Adiv_meta, aes(x = Condition, y = BMI, fill = Gender)) +
geom_bar(stat = 'identity',position = "fill")
## Add the second layer, like boxplot
p5 = ggplot(Adiv_meta, aes(x = Condition, y = BMI)) +
geom_boxplot()
## Add the second layer, like heatmap
p6 = ggplot(Adiv_meta, aes(x = SampleID, y = '')) +
geom_tile(aes(fill = BMI))
ggarrange(p1,p2,p3,p4,p5,p6, labels = c("p1", "p2", "p3", "p4","p5","p6"))
# Add your code below:
Q: how to improve these graphs, in case:
you name it!
### create a graph using default theme
p = ggplot(Adiv_meta, aes(x = Condition, y = BMI)) +
geom_boxplot() +
ggtitle('default theme')
## add the third layer with customized theme
## change the theme and check the difference between these existed themes
p1 = p + theme_bw() + ggtitle('theme_bw()')
p2 = p + theme_linedraw() + ggtitle('theme_linedraw()')
p3 = p + theme_classic() + ggtitle('theme_classic()')
p4 = p + theme_dark() + ggtitle('theme_dark()')
p5 = p + theme_minimal() + ggtitle('theme_minimal()')
### arrange above plots into one and display
ggarrange(p,p1,p2,p3,p4,p5, labels = c("p", "p1","p2","p3","p4","p5"))
To modify an individual theme component you use code like plot + theme(element.name = element_function()).
element_text(), will show the text in the graph
element_blank(), will remove the text in the graph
For example:
p = ggplot(Adiv_meta, aes(x = Condition, y = BMI)) +
geom_boxplot() +
theme_classic() +
ggtitle('theme_classic()')
# add argument inside of geom_boxplot() function, e.g, fill and width
p1 = ggplot(Adiv_meta, aes(x = Condition, y = BMI)) +
geom_boxplot(aes(fill = Condition)) +
theme_classic() +
ggtitle('change width and color of box')
# in case, the length of text is too long, we can rotate the label.
# change the color of title and text in x-axis
# customize this argument within theme function: axis.title.x or axis.title.y
# for example:
p2 = ggplot(Adiv_meta, aes(x = Condition, y = BMI)) +
geom_boxplot(aes(fill = Condition)) +
theme_classic() +
ggtitle('change color of x.title') +
theme(axis.title.x = element_text(colour = 'red'))
# Multiple modification
# Modify the graph by
# change the color of title in x-axis
# rotate x-axis text
# change the legend position and
p3 = ggplot(Adiv_meta, aes(x = Condition, y = BMI)) +
geom_boxplot(aes(fill = Condition)) +
theme_classic() +
ggtitle('after mutiple modification') +
theme(axis.title.x = element_text(colour = 'red'),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5, colour = 'red'),
legend.position = 'left') # can be in left, bottom, top, or none
ggarrange(p, p1, p2, p3, labels = c("p", "p1", "p2","p3"))
colnames(pd)
## [1] "OTU" "Sample" "Abundance" "SampleID" "Condition" "Age"
## [7] "Gender" "BMI" "Kingdom" "Phylum" "Class" "Order"
## [13] "Family" "Genus" "Species"
ggplot(data = pd, aes(x = Sample, y = Abundance, fill = Class)) +
geom_bar(stat = 'identity',position = "fill")
how to solve the staked x-axis label?
# add your code here:
how to change the float to percentage on y-axis label?
# add your code here:
ggplot(data = pd, aes(x = Gender, y = Abundance)) +
geom_bar(stat = 'identity',position = "fill", aes(fill = Class)) +
theme_classic2() +
facet_grid(. ~ Condition, scales="free")
length(unique(pd$Class))
## [1] 15
So, we need to supply 15 different colors to the corresponding bacterial at class level. - colors in R
ggplot(data = pd, aes(x = Sample, y = Abundance)) +
geom_bar(stat = 'identity',position = "fill", aes(fill = Class), width = 2) +
theme_classic2() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 6)) +
facet_wrap(. ~ Condition, scales="free") +
scale_fill_manual(values = terrain.colors(15))
## Warning: position_stack requires non-overlapping x intervals
## position_stack requires non-overlapping x intervals
# you can also use rainbow(15) or heat.colors(15) or topo.colors(15) or terrain.colors(15)
#length(unique(pd$Phylum))
#6
p = ggplot(data = pd, aes(x = Condition, y = Abundance)) +
geom_bar(stat = 'identity', position = "fill", aes(fill = Phylum),width = 0.8) +
theme_classic() +
scale_fill_simpsons()
p1 = ggplot(data = pd, aes(x = Condition, y = Abundance)) +
geom_bar(stat = 'identity', position = "fill", aes(fill = Phylum),width = 0.8) +
theme_classic() +
scale_fill_uchicago()
ggarrange(p,p1, nrow = 2)
Create a side-by-side barplot showing the relative abundance of bacterial at Kingdom level and modify the color, which are commonly used in the paper published in Lancet journals.
# add your code below:
p1 = ggplot(data = Adiv_meta, aes(x = Condition, y = Shannon)) +
geom_boxplot(aes(fill = Condition)) +
geom_jitter() +
theme_classic()+
scale_fill_jco() +
stat_compare_means(method = "wilcox.test") +
scale_y_continuous(expand = c(0.01, 0.2))
p2 = ggplot(data = Adiv_meta, aes(x = Condition, y = Shannon)) +
geom_violin(aes(fill = Condition)) +
theme_classic()+
scale_fill_jco() +
stat_compare_means(method = "wilcox.test") +
scale_y_continuous(expand = c(0.01, 0.2))
p3 = ggplot(data = Adiv_meta, aes(x = Condition, y = Chao1)) +
geom_boxplot(aes(fill = Condition)) +
theme_classic()+
scale_fill_uchicago() +
stat_compare_means(method = "wilcox.test",aes(label = ..p.signif..)) +
scale_y_continuous(expand = c(0.01, 20))
p4 = ggplot(data = Adiv_meta, aes(x = Condition, y = Chao1)) +
geom_violin(aes(fill = Condition)) +
theme_bw()+
facet_grid(.~ Gender) +
scale_fill_uchicago() +
stat_compare_means(method = "wilcox.test",aes(label = ..p.signif..)) +
scale_y_continuous(expand = c(0.01, 20))
ggarrange(p1, p2,p3,p4, labels = c("p1","p2","p3","p4"), nrow = 2, ncol = 2)
How to evaluate the difference of Shannon/Chao1 between Gender in each condition, significant or not?
# add your code below:
cor.test(Adiv_meta$BMI, Adiv_meta$Age, method = 'spearman')
## Warning in cor.test.default(Adiv_meta$BMI, Adiv_meta$Age, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: Adiv_meta$BMI and Adiv_meta$Age
## S = 8217.3, p-value = 0.0004962
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.4932301
# add your code below:
cor.test(Adiv_meta$BMI, Adiv_meta$Shannon, method = 'spearman')
## Warning in cor.test.default(Adiv_meta$BMI, Adiv_meta$Shannon, method =
## "spearman"): Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: Adiv_meta$BMI and Adiv_meta$Shannon
## S = 16887, p-value = 0.7845
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.04143346
p1 = ggplot(data = Adiv_meta, aes(x = BMI, y = Age)) +
geom_point(shape = 21, fill = '#0f993d', color = 'white', size = 3) +
sm_corr_theme() +
sm_statCorr(corr_method = "spearman")
p2 = ggplot(data = Adiv_meta, aes(x = BMI, y = Shannon)) +
geom_point(shape = 21, fill = 'purple', color = 'white', size = 3) +
sm_corr_theme() +
sm_statCorr(corr_method = "spearman")
ggarrange(p1, p2, labels = c("p1","p2"))
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
how about the correlation between Age and Shannon index
# Add your code below:
pd_abundant = pd_abundant %>% mutate(Kingdom = str_replace(Kingdom, 'k__',''),
Phylum = str_replace(Phylum, 'p__',''),
Class = str_replace(Class, 'c__',''),
Order = str_replace(Order, 'o__',''),
Family = str_replace(Family, 'f__',''),
Genus = str_replace(Genus, 'g__',''),
Species = str_replace(Species, 's__',''))
ggplot(data = pd_abundant, aes(x=SampleID, y=Species, fill = log10(Abundance+1))) +
geom_tile() +
theme_classic() +
facet_grid(.~ Condition, scales = 'free') +
scale_fill_gradient2(midpoint = 0.8, mid = 'white',low = '#131D5C', high = '#CE3633') +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank()) +
rremove('x.ticks')
# plotting and saving as a object in R (e.g, fig_a)
fig_a = ggplot(Adiv_meta, aes(x = Condition, y = BMI)) +
geom_boxplot(aes(fill = Condition)) +
geom_jitter() +
theme_bw() +
ggtitle('BMI in different condition')
# at least 4 arguments should be given to ggsave() function
# first, a object that represents the graph you generate, (e.g., fig_a in the case)
# second, file name and the folder you will save the file
# third and forth are about width and height of graph, respectively
ggsave(fig_a, filename = 'Figures/00.BMI.condition.pdf', width = 4, height = 3)
# you can save the graph in other version
# ggsave() support : "eps", "ps", "tex" (pictex), "pdf", "jpeg", "tiff", "png", "bmp", "svg"
fig_b = ggplot(data = pd_abundant, aes(x=SampleID, y=Species, fill = log10(Abundance+1))) +
geom_tile() +
theme_classic() +
facet_grid(.~ Condition, scales = 'free') +
scale_fill_gradient2(midpoint = 0.8, mid = 'white',low = '#131D5C', high = '#CE3633') +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
legend.position = 'bottom') +
rremove('x.ticks') +
ggtitle('Abundant species across samples')
combined_fig = ggarrange(fig_a, fig_b, labels = c("A","B"))
ggsave(combined_fig, filename = 'Figures/00.combined_fig.pdf', width = 10, height = 5)
-End, thank you!!